#!/usr/bin/Rscript

library(DBI)
library(RSQLite)

# Functions

# Write a CSV listing the threads most similar to a set of query threads.
#
# For each id in `ids`, search_similar() is asked for the `n_results` most
# similar thread ids. The query ids themselves are appended, duplicates are
# removed, and the title and message of every resulting thread are written to
# paste0(path, filename, ".csv").
#
# Args:
#   ids:             thread ids to use as queries.
#   thread:          data frame with columns threadId, title and message.
#   path:            output directory prefix; it is pasted verbatim in front
#                    of `filename`, so it must already end with a separator.
#   filename:        output file name without the ".csv" extension.
#   term_doc_matrix: term-document matrix forwarded to search_similar().
#   n_results:       number of similar threads requested per query id.
#
# Returns:
#   The value of write.csv(); the function is called for its side effect.
write_search_results <- function (ids, thread, path, filename, term_doc_matrix, n_results) {
  # Use the matrix supplied by the caller. The previous version ignored this
  # argument and read a global called term.doc.matrix.stm instead.
  similar <- lapply(ids, search_similar, term_doc_matrix, n_results)
  results <- unique(c(unlist(similar, use.names = FALSE), as.character(ids)))
  results <- results[!is.na(results)]

  # One row per id: match() yields exactly one index (or NA) per result, so
  # the three columns always have the same length. Ids that are not present
  # in `thread` get NA for title and message.
  rows <- match(results, as.character(thread$threadId))
  df <- data.frame(id = results,
                   title = thread$title[rows],
                   message = thread$message[rows])
  write.csv(df, file = paste0(path, filename, ".csv"))
}

# Rank the documents of a term-document matrix by similarity to one document.
#
# The query document is the column named paste0("doc", query_id). Its term
# vector is multiplied with every other column of the matrix and the
# resulting scores are sorted in decreasing order.
#
# Args:
#   query_id:        id of the query document (without the "doc" prefix).
#   term_doc_matrix: term-document matrix whose document names are of the
#                    form "doc<id>".
#   n_results:       maximum number of ids to return (non-negative).
#
# Returns:
#   Character vector with at most `n_results` ids, best match first. Fewer
#   ids are returned when the corpus holds fewer documents.
#
# Raises:
#   An error ("No match in corpus") if the query document is not a column of
#   the matrix.
search_similar <- function (query_id, term_doc_matrix, n_results) {
  library(tm)
  library(slam)
  # Locate the query column by its document name
  doc_name <- paste0("doc", query_id)
  doc_names <- term_doc_matrix$dimnames$Docs
  query_index <- match(doc_name, doc_names)
  if (is.na(query_index)) stop("No match in corpus")
  # The corpus side excludes the query column itself
  query_tdm <- term_doc_matrix[, query_index]
  corpus_tdm <- term_doc_matrix[, -query_index]
  # Transposed query vector times corpus: one score per remaining document
  score <- matprod_simple_triplet_matrix(t(query_tdm), corpus_tdm)
  # Strip only the leading "doc" prefix to recover the ids
  thread_ids <- sub("^doc", "", colnames(score))
  ranking <- order(as.vector(score), decreasing = TRUE)
  # head() never pads with NA and returns nothing for n_results == 0,
  # unlike indexing with 1:n_results
  head(as.character(thread_ids[ranking]), n_results)
}

# Find the first token of `name` that is a known first name.
#
# `name` is lower-cased and split on single spaces; the first column/element
# of `name_dictionary` is lower-cased and used as the set of known names.
#
# Returns the first matching token (lower case), or NA when no token matches.
firstnames_first <- function (name, name_dictionary) {
  known_names <- tolower(as.character(name_dictionary[[1]]))
  tokens <- unlist(strsplit(tolower(as.character(name)), " "))

  hits <- tokens[tokens %in% known_names]
  if (length(hits) > 0) {
    return(hits[[1]])
  }
  NA
}

# Read a whole table from an SQLite database file into a data frame.
#
# Args:
#   database: path of the SQLite file.
#   table:    name of the table to read. NOTE: the name is pasted into the
#             SQL text unescaped, so it must come from trusted code only.
#
# Returns:
#   Data frame with every row and column of `table`.
sqLiteConnect <- function(database, table) {
  con <- DBI::dbConnect(RSQLite::SQLite(), dbname = database)
  # Close the connection even when the query below fails
  on.exit(DBI::dbDisconnect(con), add = TRUE)
  # dbGetQuery() sends the statement, fetches all rows and clears the result
  DBI::dbGetQuery(con, paste0("SELECT * FROM ", table, ";"))
}

# Tell whether `id` is absent from `comment_vector`.
#
# Vectorised over `id`: returns one logical per element, TRUE when the element
# does not occur in `comment_vector` and FALSE when it does.
whatDupDelete <- function(id,comment_vector) {
  !(id %in% comment_vector)
}

# Upper-case the first letter of every space-separated word.
#
# Only the first element of `string` is processed; the rest of each word is
# left untouched and the words are re-joined with single spaces.
simpleCap <- function(string) {
  words <- strsplit(string, " ")[[1]]
  initials <- toupper(substring(words, 1, 1))
  remainders <- substring(words, 2)
  paste0(initials, remainders, collapse = " ")
}

# Look up a gender for each name in `vector`.
#
# `dictionary` is indexed positionally: element 1 holds the names to match
# against, element 2 the value returned for each name.
# Returns the element-2 entries matched to the processed names (NA where no
# dictionary name matches).
genderAttribution <- function(vector, dictionary) {
  require(stringr)

  # Pick the token of each name that firstnames_first() recognises
  first_names <- sapply(vector, firstnames_first, dictionary)

  # Keep only the first word of that token
  first_words <- sapply(first_names, word, 1)

  # Replace hyphens with spaces so each part is capitalised separately
  spaced <- gsub("-", " ", first_words)

  # Capitalise the first letter of each word to match the dictionary casing
  capitalised <- sapply(spaced, simpleCap)

  # Positions of the names in the dictionary, then the matching values
  rows <- match(capitalised, dictionary[[1]])
  dictionary[[2]][rows]
}

# Repair mis-encoded character sequences in `vector`.
#
# Applies a fixed chain of gsub() substitutions; each call rewrites one
# garbled sequence to the character given as its replacement. Patterns are
# passed to gsub() with its defaults, i.e. as regular expressions.
# The ORDER of the calls matters: for each target character the longest
# garbled form is replaced first, then its shorter tails, and the bare
# catch-all patterns at the very end would otherwise consume parts of the
# longer sequences.
#
# Returns the value of the last assignment, i.e. the fully substituted
# vector, invisibly (the final expression is an assignment).
replaceCodingError <- function(vector) {
  # Accented lower-case e (grave, then acute) and upper-case E acute
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â¨","è", vector)
  vector <- gsub("Â¨","è", vector)
  vector <- gsub("Ã¨","è", vector)
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â©","é", vector)
  vector <- gsub("Â©","é", vector)
  vector <- gsub("Ã©","é", vector)
  vector <- gsub("Ã\u0083Æ\u0092Ã\u008bâ\u0080","É", vector)
  # Accented i, o, u and n with tilde
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â¬","ì", vector)
  vector <- gsub("Â¬","ì", vector)
  vector <- gsub("Ã¬","ì", vector)
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â²","ò", vector)
  vector <- gsub("Â²","ò", vector)
  vector <- gsub("Ã²","ò", vector)
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â¹","ù", vector)
  vector <- gsub("Â¹","ù", vector)
  vector <- gsub("Ã¹","ù", vector)
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â±","ñ", vector)
  vector <- gsub("Â±","ñ", vector)
  vector <- gsub("Ã±","ñ", vector)
  # Apostrophes and quotation marks
  vector <- gsub("Ã\u0083â\u0080\u009aÃ\u0082Â´","’", vector)
  vector <- gsub("Ã\u0083Â¢Ã¢â\u0080\u009aÂ¬Ã¢â\u0080\u009eÂ¢","’", vector)
  vector <- gsub("â€™","’", vector)
  # This pattern starts with a newline, which is consumed by the replacement
  vector <- gsub("\nÃ\u0083â\u0080\u009aÃ\u0082Â«"," «", vector)
  vector <- gsub("Ã\u0083â\u0080\u009aÃ\u0082Â»"," »", vector)
  vector <- gsub("Ã\u0083Â¢Ã¢â\u0080\u009aÂ¬Ã\u0085â\u0080\u009c","\"", vector)
  vector <- gsub("â€œ","“", vector)
  vector <- gsub("â€\u009d","”", vector)
  # Euro sign and bullet
  vector <- gsub("Ã\u0083Â¢Ã¢â\u0082¬Å¡Ã\u0082Â¬","€", vector)
  vector <- gsub("â‚¬","€", vector)
  vector <- gsub("â€¢","•", vector)
  # Same pattern as the previous line; nothing is left to replace here
  vector <- gsub("â€¢","•", vector)
  vector <- gsub("à·","-", vector)
  # HTML non-breaking space entity becomes a plain space
  vector <- gsub("&nbsp;"," ", vector)
  # Catch-alls: every remaining garbled lead character becomes a grave a.
  # These must stay last because they are substrings of the patterns above.
  vector <- gsub("Ã\u0083Æ\u0092Ã\u0082Â","à", vector)
  vector <- gsub("Ã","à", vector)
  vector <- gsub("Â","à", vector)
}

# Count the tokens that tm's scan_tokenizer() finds in `string`.
countToken <- function (string) {
  require(tm)
  tokens <- scan_tokenizer(string)
  length(tokens)
}

# Split a full name into first-name and last-name parts.
#
# `name` is lower-cased and split on single spaces; `names` is the set of
# known first names (compared as-is, so it should already be lower case).
#
# Returns a list with elements first_name and last_name:
#   * one token:  the token as first name and NA as last name if the token is
#                 a known first name; otherwise NULL (invisibly).
#                 NOTE(review): the NULL result for an unknown single token
#                 is kept from the original code - confirm it is intended.
#   * two tokens: the known first name goes first; when the first token is
#                 known, or neither is, the original order is kept.
#   * otherwise:  all known first names joined by spaces form first_name and
#                 the remaining tokens form last_name. When no token is
#                 known (including empty input) the whole name is returned
#                 as first_name with last_name "".
firstAndLastName <- function(name, names) {
  tokens <- unlist(strsplit(tolower(name), " "))
  n_tokens <- length(tokens)

  if (n_tokens == 1) {
    if (tokens[1] %in% names) {
      return(list(first_name = tokens[1], last_name = NA))
    }
    return(invisible(NULL))
  }

  if (n_tokens == 2) {
    if (!(tokens[1] %in% names) && tokens[2] %in% names) {
      return(list(first_name = tokens[2], last_name = tokens[1]))
    }
    return(list(first_name = tokens[1], last_name = tokens[2]))
  }

  # Vectorised membership test; also safe for zero tokens, where looping
  # over 1:len used to index past the end and fail.
  is_first <- tokens %in% names
  if (!any(is_first)) {
    return(list(first_name = paste(tokens, collapse = " "), last_name = ""))
  }
  list(first_name = paste(tokens[is_first], collapse = " "),
       last_name = paste(tokens[!is_first], collapse = " "))
}

# Put the first name in front, based on the number of tokens in `name`.
#
# `name` is lower-cased and split on single spaces:
#   * fewer than two tokens: the token(s) are returned unchanged;
#   * exactly two tokens:    they are returned in swapped order, joined by a
#                            space (presumably the input is "last first" -
#                            verify against the callers);
#   * more than two tokens:  firstnames_first() picks the first token that
#                            is listed in `name_dictionary`, or NA.
#
# `name_dictionary` is handed to firstnames_first() untouched. That helper
# extracts and lower-cases element 1 itself, so pre-flattening the dictionary
# here would make it look at the first dictionary entry only.
name_first <- function (name, name_dictionary) {
  tokens <- unlist(strsplit(tolower(as.character(name)), " "))
  n_tokens <- length(tokens)
  if (n_tokens < 2) return(tokens)
  if (n_tokens == 2) return(paste(tokens[2], tokens[1], sep = " "))
  firstnames_first(tokens, name_dictionary)
}

# Map a name onto the id(s) of the equally named entries of another table.
#
# Args:
#   string_name: the name to look up (a single string).
#   vector_name: names of the other table.
#   vector_id:   ids of the other table, parallel to `vector_name`.
#
# Returns:
#   * `string_name` unchanged when isComparable() rejects it;
#   * otherwise the element(s) of `vector_id` whose name equals
#     `string_name` ignoring case;
#   * when nothing matches, the lower-cased name with every
#     non-alphanumeric character replaced by a space, followed by " NEWFK".
crossTableNameMatch <- function (string_name, vector_name, vector_id) {
  if (!isComparable(string_name)) {
    return(string_name)
  }
  matched_index <- which(toupper(vector_name) == toupper(string_name))
  if (length(matched_index) > 0) {
    return(vector_id[matched_index])
  }
  # Qualified call: do not rely on stringr having been attached by some
  # other function that happened to run earlier.
  cleaned <- stringr::str_replace_all(string_name, "[^[:alnum:]]", " ")
  return(appendToString(tolower(cleaned), "NEWFK"))
}

# Decide whether a string is suitable for name matching.
#
# Returns TRUE when countToken() reports at least two tokens and the string
# contains no digit, FALSE otherwise.
isComparable <- function (string) {
  # Scalar && short-circuits, so the digit check is skipped for strings that
  # already fail the token-count test.
  countToken(string) >= 2 && !grepl("\\d+", string)
}

# Join `string` and `extension` with a single space.
# The result is returned invisibly.
appendToString <- function (string, extension) {
  invisible(paste(string, extension))
}

# Label each text as spam (1) or not (0) by counting dictionary terms.
#
# Args:
#   text_vector: texts to classify; coerced to character.
#   dictionary:  terms to count, passed as the `dictionary` control option
#                of the document-term matrix.
#   threshold:   a text is labelled 1 when its total count of dictionary
#                terms is strictly greater than this value.
#   wordLengths: minimum term length; the maximum is left unbounded (Inf).
#
# Returns:
#   Numeric vector of 0/1 labels, one per element of `text_vector`.
getSpamLabel <- function (text_vector, dictionary, threshold, wordLengths) {
  # tm functions are namespace-qualified so the function works even when tm
  # has not been attached by an earlier call.
  corpus <- tm::Corpus(tm::VectorSource(as.character(text_vector)))
  dtm <- tm::DocumentTermMatrix(
    corpus,
    control = list(dictionary = dictionary,
                   wordLengths = c(wordLengths, Inf))
  )
  # Total number of dictionary-term occurrences per document
  hits_per_doc <- rowSums(as.matrix(dtm))
  as.numeric(hits_per_doc > threshold)
}

# Pass `string` through XML::xmlTextNode() and return element 6 of the
# node's character representation, invisibly.
# NOTE(review): element 6 is presumably the escaped text value of the node -
# confirm against the installed XML package version.
sanitizeForXml <- function (string) {
  node <- XML::xmlTextNode(string)
  fields <- as.character(node)
  invisible(fields[6])
}

# Replace a URL pattern inside `string`, but only when pmatch() finds
# `pattern` as a partial match of `string` (i.e. `string` begins with it).
#
# Args:
#   string:          text to process.
#   pattern:         literal text used for the pmatch() test.
#   pattern_escaped: regular expression handed to gsub().
#   replacement:     replacement text for gsub().
#
# Returns `string` unchanged when the pmatch() test fails, otherwise the
# result of the gsub() substitution.
replaceLongUrl <- function(string,pattern,pattern_escaped,replacement){
  if (is.na(pmatch(pattern, string))) {
    return(string)
  }
  gsub(pattern_escaped, replacement, string)
}

# Replace `string` by the marker "THIS_IS_HAM" when it matches `pattern`
# (a regular expression); otherwise return `string` unchanged.
filterAuthorOut <- function(string,pattern) {
  if (!grepl(pattern, string)) {
    return(string)
  }
  "THIS_IS_HAM"
}

# Escape regular-expression metacharacters in `strings` with a backslash.
#
# The characters handled are \ [ ] ( ) { } ^ $ * + ? . and |. The backslash
# is processed first so that the backslashes inserted for the other
# characters are not escaped a second time.
re.escape <- function(strings){
  specials <- c("\\\\", "\\[", "\\]", "\\(", "\\)",
                "\\{", "\\}", "\\^", "\\$", "\\*",
                "\\+", "\\?", "\\.", "\\|")
  escaped <- paste0("\\\\", specials)
  # Fold the substitutions over the input, one metacharacter at a time
  Reduce(
    function(acc, k) gsub(specials[k], escaped[k], acc),
    seq_along(specials),
    strings
  )
}

# Return the most frequent value of `x`.
# On ties, the value that appears first in `x` wins.
Mode <- function(x) {
  distinct <- unique(x)
  counts <- tabulate(match(x, distinct))
  distinct[which.max(counts)]
}

# TRUE when more than one element of `vector` is found in `string`,
# FALSE otherwise.
appearsMoreThanOnce <- function (string, vector) {
  sum(vector %in% string) > 1
}

# Strip the leading characters of a string that compares below 0.
#
# `string < 0` and `char > 0` compare text with the number 0, so R coerces 0
# to "0" and compares as strings; the outcome therefore follows the
# collation order of the current locale.
#
# * When `string` does not compare below 0 it is returned unchanged.
# * Otherwise the result is the part of `string` that starts at its first
#   character comparing above 0.
# * When no such character exists, a fresh placeholder "negSan<number>" with
#   a random number between 1 and 999999 is returned.
sanitizeNegativeString <- function (string) {
  if (!(string < 0)) {
    return (string)
  }
  chars <- unlist(strsplit(string,""))
  above_zero <- which(chars > 0)
  if (length(above_zero) > 0) {
    # Keep everything from the first qualifying character to the end
    kept <- chars[above_zero[1]:length(chars)]
    return(paste(kept,sep="",collapse=""))
  }
  rdm <- as.character(sample(1:999999,1))
  paste("negSan",rdm,sep = "")
}

# SQLite database file all tables below are read from; the relative path is
# resolved against the current working directory.
primary_database <- "m5s_forum_apr2015.sqlite"

# Load four tables completely into memory as data frames. Each call to
# sqLiteConnect() opens its own connection, runs SELECT * on the named
# table and closes the connection again.
author_alpha <- sqLiteConnect(primary_database, "author")
author_beta <- sqLiteConnect(primary_database, "commentAuthor")
comment <- sqLiteConnect(primary_database, "comment")
thread <- sqLiteConnect(primary_database, "thread")

